bitkeeper revision 1.1159.184.1 (41a226e3Y9RHKGkAbgRWDb4t7yHQPQ)
author akw27@labyrinth.cl.cam.ac.uk <akw27@labyrinth.cl.cam.ac.uk>
Mon, 22 Nov 2004 17:50:27 +0000 (17:50 +0000)
committer akw27@labyrinth.cl.cam.ac.uk <akw27@labyrinth.cl.cam.ac.uk>
Mon, 22 Nov 2004 17:50:27 +0000 (17:50 +0000)
Initial push of the block tap code.  This is a driver to let you
intercept block requests and/or implement block devices in user space,
all in an isolated VM.

13 files changed:
.rootkeys
linux-2.6.9-xen-sparse/arch/xen/Kconfig
linux-2.6.9-xen-sparse/arch/xen/configs/xen0_defconfig
linux-2.6.9-xen-sparse/arch/xen/configs/xenU_defconfig
linux-2.6.9-xen-sparse/drivers/xen/Makefile
linux-2.6.9-xen-sparse/drivers/xen/blkback/blkback.c
linux-2.6.9-xen-sparse/drivers/xen/blkfront/blkfront.c
linux-2.6.9-xen-sparse/drivers/xen/blktap/Makefile [new file with mode: 0644]
linux-2.6.9-xen-sparse/drivers/xen/blktap/blktap.c [new file with mode: 0644]
linux-2.6.9-xen-sparse/drivers/xen/blktap/blktap.h [new file with mode: 0644]
linux-2.6.9-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c [new file with mode: 0644]
linux-2.6.9-xen-sparse/drivers/xen/blktap/blktap_datapath.c [new file with mode: 0644]
linux-2.6.9-xen-sparse/drivers/xen/blktap/blktap_userdev.c [new file with mode: 0644]

index aa50d08537366adfcf90d5622c5594c4b25f90bb..09e1b67be9c9fa8cc51440a1f3e809e7d2609805 100644 (file)
--- a/.rootkeys
+++ b/.rootkeys
 40f56239-JNIaTzlviVJohVdoYOUpw linux-2.6.9-xen-sparse/drivers/xen/blkfront/blkfront.c
 40f56239y9naBTXe40Pi2J_z3p-d1g linux-2.6.9-xen-sparse/drivers/xen/blkfront/block.h
 40f56239BVfPsXBiWQitXgDRtOsiqg linux-2.6.9-xen-sparse/drivers/xen/blkfront/vbd.c
+41a226e0vjAcDXHOnXE5ummcdUD2mg linux-2.6.9-xen-sparse/drivers/xen/blktap/Makefile
+41a226e0VeZA1N8tbU6nvJ3OxUcJmw linux-2.6.9-xen-sparse/drivers/xen/blktap/blktap.c
+41a226e1k4J5VMLnrYXDWRqElS49YQ linux-2.6.9-xen-sparse/drivers/xen/blktap/blktap.h
+41a226e1-A_Hy7utS8vJKaXnH_tzfA linux-2.6.9-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c
+41a226e19NoUUTOvs7jumDMRYDIO4Q linux-2.6.9-xen-sparse/drivers/xen/blktap/blktap_datapath.c
+41a226e1MNSyWWK5dEVgvSQ5OW0fDA linux-2.6.9-xen-sparse/drivers/xen/blktap/blktap_userdev.c
 40f56239fsLjvtD8YBRAWphps4FDjg linux-2.6.9-xen-sparse/drivers/xen/console/Makefile
 3e5a4e651TH-SXHoufurnWjgl5bfOA linux-2.6.9-xen-sparse/drivers/xen/console/console.c
 40f56239KYxO0YabhPzCTeUuln-lnA linux-2.6.9-xen-sparse/drivers/xen/evtchn/Makefile
index a9675229ae5c64d96fc74576e76939fd5d3fa019..d520aefe17c0e05a989d20e08662622a422538dc 100644 (file)
@@ -49,6 +49,20 @@ config XEN_BLKDEV_BACKEND
           block devices to other guests via a high-performance shared-memory
           interface.
 
+if XEN_BLKDEV_BACKEND
+config XEN_BLKDEV_TAP_BE
+        bool "Block Tap support for backend driver (DANGEROUS)"
+        default n
+        help
+          If you intend to use the block tap driver, the backend domain will
+          not know the domain id of the real frontend, and so will not be able
+          to map its data pages.  This modifies the backend to attempt to map
+          from both the tap domain and the real frontend.  This presents a
+          security risk, and so should ONLY be used for development
+          with the blktap.  This option will be removed as the block drivers are
+          modified to use grant tables.
+endif
+
 config XEN_NETDEV_BACKEND
         bool "Network-device backend driver"
         default y if XEN_PHYSDEV_ACCESS
@@ -94,6 +108,16 @@ config XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER
           enabled; then you must say N here.
 endif
 
+config XEN_BLKDEV_TAP
+        bool "Block device tap driver"
+        default n
+        help
+          This driver allows a VM to interact on block device channels
+          to other VMs.  Block messages may be passed through or redirected
+          to a character device, allowing device prototyping in application
+          space.  Odds are that you want to say N here.
+
+
 config XEN_WRITABLE_PAGETABLES
        bool
        default y
index 1532ab3dfb7eebeb7087e465e3a4c84de3569054..1455a24d5fe988d27324c0a5b8d053b6a2317bc7 100644 (file)
@@ -13,9 +13,11 @@ CONFIG_NO_IDLE_HZ=y
 CONFIG_XEN_PRIVILEGED_GUEST=y
 CONFIG_XEN_PHYSDEV_ACCESS=y
 CONFIG_XEN_BLKDEV_BACKEND=y
+# CONFIG_XEN_BLKDEV_TAP_BE is not set
 CONFIG_XEN_NETDEV_BACKEND=y
 CONFIG_XEN_BLKDEV_FRONTEND=y
 CONFIG_XEN_NETDEV_FRONTEND=y
+# CONFIG_XEN_BLKDEV_TAP is not set
 # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set
 CONFIG_XEN_WRITABLE_PAGETABLES=y
 CONFIG_XEN_SCRUB_PAGES=y
index 24c57a3f50059348e61d27e271afea91aa65eaab..50d61f79401f7b7d88fa3fb331693b91b8c0e34e 100644 (file)
@@ -13,9 +13,11 @@ CONFIG_NO_IDLE_HZ=y
 # CONFIG_XEN_PRIVILEGED_GUEST is not set
 # CONFIG_XEN_PHYSDEV_ACCESS is not set
 # CONFIG_XEN_BLKDEV_BACKEND is not set
+# CONFIG_XEN_BLKDEV_TAP_BE is not set
 # CONFIG_XEN_NETDEV_BACKEND is not set
 CONFIG_XEN_BLKDEV_FRONTEND=y
 CONFIG_XEN_NETDEV_FRONTEND=y
+# CONFIG_XEN_BLKDEV_TAP is not set
 # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set
 CONFIG_XEN_WRITABLE_PAGETABLES=y
 CONFIG_XEN_SCRUB_PAGES=y
index e181171a61a50bd713b0b7afcdeadd98b133a908..8728d6a725c323b45e62fc24368e7d7bc8213071 100644 (file)
@@ -9,4 +9,5 @@ obj-$(CONFIG_XEN_BLKDEV_BACKEND)        += blkback/
 obj-$(CONFIG_XEN_NETDEV_BACKEND)       += netback/
 obj-$(CONFIG_XEN_BLKDEV_FRONTEND)      += blkfront/
 obj-$(CONFIG_XEN_NETDEV_FRONTEND)      += netfront/
+obj-$(CONFIG_XEN_BLKDEV_TAP)           += blktap/
 
index 6d201022d25269929e6d27bcff39e43fe664a1d2..ce5b01037568681b84bc09a603a6d54024c36294 100644 (file)
@@ -68,6 +68,19 @@ static PEND_RING_IDX pending_prod, pending_cons;
 static kmem_cache_t *buffer_head_cachep;
 #endif
 
+#ifdef CONFIG_XEN_BLKDEV_TAP_BE
+/*
+ * If the tap driver is used, we may get pages belonging to either the tap
+ * or (more likely) the real frontend.  The backend must specify which domain
+ * a given page belongs to in update_va_mapping though.  For the moment, 
+ * we pass in the domid of the real frontend in PROBE messages and store 
+ * this value in alt_dom.  Then on mapping, we try both.  This is a Guinness
+ * Book of Records-calibre grim hack, and represents a bit of a security risk.
+ * Grant tables will soon solve the problem though!
+ */
+static domid_t alt_dom = 0;
+#endif
+
 static int do_block_io_op(blkif_t *blkif, int max_to_do);
 static void dispatch_probe(blkif_t *blkif, blkif_request_t *req);
 static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
@@ -323,12 +336,27 @@ static void dispatch_probe(blkif_t *blkif, blkif_request_t *req)
          (blkif_last_sect(req->frame_and_sects[0]) != 7) )
         goto out;
 
+#ifdef CONFIG_XEN_BLKDEV_TAP_BE
+    /* Grab the real frontend out of the probe message. */
+    alt_dom = (domid_t)req->frame_and_sects[1];
+#endif
+    
     if ( HYPERVISOR_update_va_mapping_otherdomain(
         MMAP_VADDR(pending_idx, 0) >> PAGE_SHIFT,
         (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL },
-        0, blkif->domid) )
+        0, blkif->domid) ) {
+#ifdef CONFIG_XEN_BLKDEV_TAP_BE
+        /* That didn't work.  Try alt_dom. */
+        if ( HYPERVISOR_update_va_mapping_otherdomain(
+            MMAP_VADDR(pending_idx, 0) >> PAGE_SHIFT,
+            (pte_t) { (req->frame_and_sects[0] & PAGE_MASK) | __PAGE_KERNEL },
+            0, alt_dom) )
+            goto out;
+#else  
         goto out;
-
+#endif
+    }
+    
     rsp = vbd_probe(blkif, (vdisk_t *)MMAP_VADDR(pending_idx, 0), 
                     PAGE_SIZE / sizeof(vdisk_t));
 
@@ -411,8 +439,11 @@ static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
         mcl[i].args[0] = MMAP_VADDR(pending_idx, i) >> PAGE_SHIFT;
         mcl[i].args[1] = (phys_seg[i].buffer & PAGE_MASK) | remap_prot;
         mcl[i].args[2] = 0;
+#ifdef CONFIG_XEN_BLKDEV_TAP_BE
+        mcl[i].args[3] = (alt_dom != 0) ? alt_dom : blkif->domid;
+#else
         mcl[i].args[3] = blkif->domid;
-
+#endif
         phys_to_machine_mapping[__pa(MMAP_VADDR(pending_idx, i))>>PAGE_SHIFT] =
             FOREIGN_FRAME(phys_seg[i].buffer >> PAGE_SHIFT);
     }
@@ -579,7 +610,10 @@ static int __init blkif_init(void)
 #endif
 
     blkif_ctrlif_init();
-
+    
+#ifdef CONFIG_XEN_BLKDEV_TAP_BE
+    printk(KERN_ALERT "NOTE: Blkif backend is running with tap support on!\n");
+#endif
     return 0;
 }
 
index 33840996412eba0995d4383eb251ab454c037a22..2f792083212ceeb3a1be4b7a0c3a8483377cf2c8 100644 (file)
@@ -1262,7 +1262,8 @@ static void blkif_status(blkif_fe_interface_status_t *status)
 {
     if ( status->handle != blkif_handle )
     {
-        WPRINTK(" Invalid blkif: handle=%u", status->handle);
+        WPRINTK(" Invalid blkif: handle=%u\n", status->handle);
+        unexpected(status);
         return;
     }
 
diff --git a/linux-2.6.9-xen-sparse/drivers/xen/blktap/Makefile b/linux-2.6.9-xen-sparse/drivers/xen/blktap/Makefile
new file mode 100644 (file)
index 0000000..80b7ca0
--- /dev/null
@@ -0,0 +1,3 @@
+
+obj-y  := blktap_userdev.o blktap_datapath.o blktap_controlmsg.o blktap.o 
+
diff --git a/linux-2.6.9-xen-sparse/drivers/xen/blktap/blktap.c b/linux-2.6.9-xen-sparse/drivers/xen/blktap/blktap.c
new file mode 100644 (file)
index 0000000..5e7d47c
--- /dev/null
@@ -0,0 +1,86 @@
+/******************************************************************************
+ * blktap.c
+ * 
+ * XenLinux virtual block-device tap.
+ * 
+ * Copyright (c) 2004, Andrew Warfield
+ *
+ * Based on the original split block driver:
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
+ * Copyright (c) 2004, Christian Limpach
+ * 
+ * Note that unlike the split block driver code, this driver has been developed
+ * strictly for Linux 2.6
+ */
+
+#include "blktap.h"
+
+/*
+ * Driver entry point, registered with __initcall().
+ *
+ * Registers blkif_ctrlif_rx() as the receiver for CMSG_BLKIF_FE and
+ * CMSG_BLKIF_BE control messages, sends a DRIVER_STATUS_UP notification to
+ * the domain controller for each of the two roles, initialises the active
+ * request tracking and finally brings up the userland channel through
+ * blktap_init().
+ *
+ * Always returns 0.
+ */
+int __init xlblk_init(void)
+{
+    ctrl_msg_t               cmsg;
+    blkif_fe_driver_status_t fe_st;
+    blkif_be_driver_status_t be_st;
+
+    printk(KERN_INFO "Initialising Xen block tap device\n");
+
+    DPRINTK("   tap - Backend connection init:\n");
+
+    (void)ctrl_if_register_receiver(CMSG_BLKIF_FE, blkif_ctrlif_rx,
+                                    CALLBACK_IN_BLOCKING_CONTEXT);
+
+    /*
+     * Send a driver-UP notification to the domain controller.  Both the
+     * message and its payload live on the stack and only some fields are
+     * assigned below, so clear them first rather than sending stack garbage.
+     */
+    memset(&cmsg, 0, sizeof(cmsg));
+    memset(&fe_st, 0, sizeof(fe_st));
+    cmsg.type      = CMSG_BLKIF_FE;
+    cmsg.subtype   = CMSG_BLKIF_FE_DRIVER_STATUS;
+    cmsg.length    = sizeof(blkif_fe_driver_status_t);
+    fe_st.status   = BLKIF_DRIVER_STATUS_UP;
+    memcpy(cmsg.msg, &fe_st, sizeof(fe_st));
+    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+
+    DPRINTK("   tap - Frontend connection init:\n");
+    
+    active_reqs_init();
+    
+    ptfe_blkif.status = DISCONNECTED;
+
+    (void)ctrl_if_register_receiver(CMSG_BLKIF_BE, blkif_ctrlif_rx, 
+                                    CALLBACK_IN_BLOCKING_CONTEXT);
+
+    /* Same notification again, this time for the backend role. */
+    memset(&cmsg, 0, sizeof(cmsg));
+    memset(&be_st, 0, sizeof(be_st));
+    cmsg.type      = CMSG_BLKIF_BE;
+    cmsg.subtype   = CMSG_BLKIF_BE_DRIVER_STATUS;
+    cmsg.length    = sizeof(blkif_be_driver_status_t);
+    be_st.status   = BLKIF_DRIVER_STATUS_UP;
+    memcpy(cmsg.msg, &be_st, sizeof(be_st));
+    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+
+    DPRINTK("   tap - Userland channel init:\n");
+
+    blktap_init();
+
+    DPRINTK("Blkif tap device initialized.\n");
+
+    return 0;
+}
+
+/* Suspend hook for the block tap.  Currently a no-op. */
+void blkdev_suspend(void)
+{
+}
+
+/*
+ * Resume hook for the block tap: re-announces the frontend role to the
+ * domain controller by sending a CMSG_BLKIF_FE_DRIVER_STATUS message with
+ * status BLKIF_DRIVER_STATUS_UP.
+ */
+void blkdev_resume(void)
+{
+    ctrl_msg_t               cmsg;
+    blkif_fe_driver_status_t st;    
+
+    /*
+     * Clear the on-stack message and payload before filling them in so that
+     * no unassigned field is sent with stack garbage in it.
+     */
+    memset(&cmsg, 0, sizeof(cmsg));
+    memset(&st, 0, sizeof(st));
+
+    /* Send a driver-UP notification to the domain controller. */
+    cmsg.type      = CMSG_BLKIF_FE;
+    cmsg.subtype   = CMSG_BLKIF_FE_DRIVER_STATUS;
+    cmsg.length    = sizeof(blkif_fe_driver_status_t);
+    st.status      = BLKIF_DRIVER_STATUS_UP;
+    memcpy(cmsg.msg, &st, sizeof(st));
+    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+}
+
+
+__initcall(xlblk_init);
diff --git a/linux-2.6.9-xen-sparse/drivers/xen/blktap/blktap.h b/linux-2.6.9-xen-sparse/drivers/xen/blktap/blktap.h
new file mode 100644 (file)
index 0000000..7e5d73d
--- /dev/null
@@ -0,0 +1,254 @@
+/*
+ * blktap.h
+ * 
+ * Interfaces for the Xen block tap driver.
+ * 
+ * (c) 2004, Andrew Warfield, University of Cambridge
+ * 
+ */
+
+#ifndef __BLKTAP_H__
+#define __BLKTAP_H__
+
+#include <linux/version.h>
+#include <linux/blkdev.h>
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <asm-xen/ctrl_if.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <asm/io.h>
+#include <asm/setup.h>
+#include <asm/pgalloc.h>
+#include <asm-xen/hypervisor.h>
+#include <asm-xen/xen-public/io/blkif.h>
+
+/* -------[ debug / pretty printing ]--------------------------------- */
+
+#if 0
+#define ASSERT(_p) \
+    if ( !(_p) ) { printk("Assertion '%s' failed, line %d, file %s", #_p , \
+    __LINE__, __FILE__); *(int*)0=0; }
+#define DPRINTK(_f, _a...) printk(KERN_ALERT "(file=%s, line=%d) " _f, \
+                           __FILE__ , __LINE__ , ## _a )
+#else
+#define ASSERT(_p) ((void)0)
+#define DPRINTK(_f, _a...) ((void)0)
+#endif
+
+#define WPRINTK(fmt, args...) printk(KERN_WARNING "blk_tap: " fmt, ##args)
+
+/* -------[ connection / request tracking ]--------------------------- */
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#define VMALLOC_VMADDR(x) ((unsigned long)(x))
+#endif
+
+extern spinlock_t blkif_io_lock;
+
+/*
+ * State for one block interface.  The tap keeps a single instance,
+ * ptfe_blkif, for the ring shared with the frontend domain.
+ */
+typedef struct blkif_st {
+    /* Unique identifier for this interface. */
+    domid_t          domid;
+    unsigned int     handle;
+    /* Physical parameters of the comms window. */
+    unsigned long    shmem_frame;
+    unsigned int     evtchn;
+    int              irq;           /* IRQ that evtchn is bound to. */
+    /* Comms information. */
+    blkif_ring_t    *blk_ring_base; /* ioremap()'ed ptr to shmem_frame. */
+    BLKIF_RING_IDX     blk_req_cons;  /* Request consumer. */
+    BLKIF_RING_IDX     blk_resp_prod; /* Private version of resp. producer. */
+    
+    /* Connection state of the interface. */
+    enum { DISCONNECTED, DISCONNECTING, CONNECTED } status;
+    /*
+     * DISCONNECT response is deferred until pending requests are ack'ed.
+     * We therefore need to store the id from the original request.
+     */
+    u8               disconnect_rspid;
+    struct blkif_st *hash_next;
+    struct list_head blkdev_list;
+    spinlock_t       blk_ring_lock;
+    atomic_t         refcnt;
+    
+    struct work_struct work;
+} blkif_t;
+
+/*
+ * Bookkeeping record for one request being tracked by the tap.
+ * NOTE(review): the field notes below are inferred from names and types
+ * only -- confirm against the users in blktap_datapath.c.
+ */
+typedef struct {
+    blkif_t       *blkif;      /* interface the request belongs to */
+    unsigned long  id;         /* presumably the original request id */
+    int            nr_pages;   /* presumably the number of entries used below */
+    /* One slot per segment of a block request. */
+    unsigned long  mach_fas[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+    unsigned long  virt_fas[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+    int            next_free;
+} active_req_t;
+
+
+/* -------[ block ring structs ]-------------------------------------- */
+
+/* Types of ring. */
+#define BLKIF_REQ_RING_TYPE 1
+#define BLKIF_RSP_RING_TYPE 2
+
+/*
+ * Generic ring struct.  It holds only the leading type field that the
+ * requestor and responder ring structs below also start with; the RING()
+ * macro casts a pointer to one of those to this type.
+ */
+typedef struct blkif_generic_ring_struct {
+    int type;                    /* One of the BLKIF_*_RING_TYPE values */
+} blkif_generic_ring_t;
+
+/*
+ * A requestor's view of a ring: private producer index for requests and
+ * consumer index for responses, plus the shared ring they refer to.
+ */
+typedef struct blkif_req_ring_struct {
+
+    int type;                    /* Will be BLKIF_REQ_RING_TYPE        */
+    BLKIF_RING_IDX req_prod;     /* PRIVATE req_prod index             */
+    BLKIF_RING_IDX rsp_cons;     /* Response consumer index            */
+    blkif_ring_t *ring;          /* Pointer to shared ring struct      */
+
+} blkif_req_ring_t;
+
+#define BLKIF_REQ_RING_INIT { BLKIF_REQ_RING_TYPE, 0, 0, 0 }
+
+/*
+ * A responder's view of a ring: private producer index for responses and
+ * consumer index for requests, plus the shared ring they refer to.
+ */
+typedef struct blkif_rsp_ring_struct {
+
+    int type;                    /* Will be BLKIF_RSP_RING_TYPE        */
+    BLKIF_RING_IDX rsp_prod;     /* PRIVATE rsp_prod index             */
+    BLKIF_RING_IDX req_cons;     /* Request consumer index             */
+    blkif_ring_t *ring;          /* Pointer to shared ring struct      */
+
+} blkif_rsp_ring_t;
+
+/* Static initialiser for a blkif_rsp_ring_t: use as "= BLKIF_RSP_RING_INIT". */
+#define BLKIF_RSP_RING_INIT { BLKIF_RSP_RING_TYPE, 0, 0, 0 }
+
+#define RING(a) (blkif_generic_ring_t *)(a)
+
+inline int BLKTAP_RING_FULL(blkif_generic_ring_t *ring);
+
+
+/* -------[ interposition -> character device interface ]------------- */
+
+/* /dev/xen/blktap resides at device number major=10, minor=202        */
+#define BLKTAP_MINOR 202
+
+/* size of the extra VMA area to map in attached pages. */
+#define BLKTAP_VMA_PAGES BLKIF_RING_SIZE
+
+/* blktap IOCTLs:                                                      */
+#define BLKTAP_IOCTL_KICK_FE         1
+#define BLKTAP_IOCTL_KICK_BE         2
+#define BLKTAP_IOCTL_SETMODE         3
+
+/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE)             */
+#define BLKTAP_MODE_PASSTHROUGH      0x00000000  /* default            */
+#define BLKTAP_MODE_INTERCEPT_FE     0x00000001
+#define BLKTAP_MODE_INTERCEPT_BE     0x00000002
+#define BLKTAP_MODE_COPY_FE          0x00000004
+#define BLKTAP_MODE_COPY_BE          0x00000008
+#define BLKTAP_MODE_COPY_FE_PAGES    0x00000010
+#define BLKTAP_MODE_COPY_BE_PAGES    0x00000020
+
+#define BLKTAP_MODE_INTERPOSE \
+           (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE)
+
+#define BLKTAP_MODE_COPY_BOTH \
+           (BLKTAP_MODE_COPY_FE | BLKTAP_MODE_COPY_BE)
+
+#define BLKTAP_MODE_COPY_BOTH_PAGES \
+           (BLKTAP_MODE_COPY_FE_PAGES | BLKTAP_MODE_COPY_BE_PAGES)
+
+/*
+ * Check a mode word passed with BLKTAP_IOCTL_SETMODE.
+ *
+ * Accepted values are passthrough, either single intercept flag, both
+ * intercept flags together (interpose), or a copy mode (FE, BE or both)
+ * optionally combined with the matching *_PAGES flag(s).
+ *
+ * Returns 1 when arg is one of those combinations and 0 otherwise.
+ */
+static inline int BLKTAP_MODE_VALID(unsigned long arg)
+{
+    /* Exact-match modes. */
+    switch ( arg )
+    {
+    case BLKTAP_MODE_PASSTHROUGH:
+    case BLKTAP_MODE_INTERCEPT_FE:
+    case BLKTAP_MODE_INTERCEPT_BE:
+    case BLKTAP_MODE_INTERPOSE:
+        return 1;
+    default:
+        break;
+    }
+
+    /* Copy modes: ignore the optional page-copy flag(s), then compare. */
+    if ( (arg & ~BLKTAP_MODE_COPY_FE_PAGES) == BLKTAP_MODE_COPY_FE )
+        return 1;
+    if ( (arg & ~BLKTAP_MODE_COPY_BE_PAGES) == BLKTAP_MODE_COPY_BE )
+        return 1;
+    if ( (arg & ~BLKTAP_MODE_COPY_BOTH_PAGES) == BLKTAP_MODE_COPY_BOTH )
+        return 1;
+
+    return 0;
+}
+
+
+
+/* -------[ Mappings to User VMA ]------------------------------------ */
+#define MAX_PENDING_REQS 64
+#define BATCH_PER_DOMAIN 16
+extern struct vm_area_struct *blktap_vma;
+
+/* The following are from blkback.c and should probably be put in a
+ * header and included from there.
+ * The mmap area described here is where attached data pages will be mapped.
+ */
+extern unsigned long mmap_vstart;
+#define MMAP_PAGES_PER_REQUEST \
+    (BLKIF_MAX_SEGMENTS_PER_REQUEST + 1)
+#define MMAP_PAGES             \
+    (MAX_PENDING_REQS * MMAP_PAGES_PER_REQUEST)
+#define MMAP_VADDR(_req,_seg)                        \
+    (mmap_vstart +                                   \
+     ((_req) * MMAP_PAGES_PER_REQUEST * PAGE_SIZE) + \
+     ((_seg) * PAGE_SIZE))
+
+/* immediately before the mmap area, we have a bunch of pages reserved
+ * for shared memory rings.
+ */
+
+#define RING_PAGES 128 
+extern unsigned long rings_vstart;
+
+/* -------[ Here be globals ]----------------------------------------- */
+
+extern unsigned long blktap_mode;
+
+
+/* blkif struct, containing ring to FE domain */
+extern blkif_t ptfe_blkif; 
+
+/* Connection to a single backend domain. */
+extern blkif_ring_t *blk_ptbe_ring;   /* Ring from the PT to the BE dom    */ 
+extern BLKIF_RING_IDX ptbe_resp_cons; /* Response consumer for comms ring. */
+extern BLKIF_RING_IDX ptbe_req_prod;  /* Private request producer.         */
+
+/* Rings up to user space. */ 
+extern blkif_req_ring_t fe_ring;// = BLKIF_REQ_RING_INIT;
+extern blkif_rsp_ring_t be_ring;// = BLKIF_RSP_RING_INIT;
+
+/* Event channel to backend domain. */
+extern unsigned int blkif_ptbe_evtchn;
+
+/* User ring status... this will soon vanish into a ring struct. */
+extern unsigned long blktap_ring_ok;
+
+/* -------[ ...and function prototypes. ]----------------------------- */
+
+/* init function for character device interface.                       */
+int blktap_init(void);
+
+/* interfaces to the char driver, passing messages to and from apps.   */
+void blktap_kick_user(void);
+int blktap_write_to_ring(blkif_request_t *req);
+
+
+/* user ring access functions: */
+int blktap_write_fe_ring(blkif_request_t *req);
+int blktap_write_be_ring(blkif_response_t *rsp);
+int blktap_read_fe_ring(void);
+int blktap_read_be_ring(void);
+
+/* and the helpers they call: */
+inline int write_resp_to_fe_ring(blkif_response_t *rsp);
+inline void kick_fe_domain(void);
+
+inline int write_req_to_be_ring(blkif_request_t *req);
+inline void kick_be_domain(void);
+
+/* Interrupt handlers. */
+irqreturn_t blkif_ptbe_int(int irq, void *dev_id, 
+                                  struct pt_regs *ptregs);
+irqreturn_t blkif_ptfe_int(int irq, void *dev_id, struct pt_regs *regs);
+
+/* Control message receiver. */
+extern void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id);
+
+#endif /* __BLKTAP_H__ */
diff --git a/linux-2.6.9-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c b/linux-2.6.9-xen-sparse/drivers/xen/blktap/blktap_controlmsg.c
new file mode 100644 (file)
index 0000000..a3d485a
--- /dev/null
@@ -0,0 +1,358 @@
+/******************************************************************************
+ * blktap_controlmsg.c
+ * 
+ * XenLinux virtual block-device tap.
+ * Control interfaces to the frontend and backend drivers.
+ * 
+ * Copyright (c) 2004, Andrew Warfield
+ *
+ */
+#include "blktap.h"
+
+#define BLKIF_STATE_CLOSED       0
+#define BLKIF_STATE_DISCONNECTED 1
+#define BLKIF_STATE_CONNECTED    2
+
+static char *blkif_state_name[] = {
+    [BLKIF_STATE_CLOSED]       = "closed",
+    [BLKIF_STATE_DISCONNECTED] = "disconnected",
+    [BLKIF_STATE_CONNECTED]    = "connected",
+};
+
+static char * blkif_status_name[] = {
+    [BLKIF_INTERFACE_STATUS_CLOSED]       = "closed",
+    [BLKIF_INTERFACE_STATUS_DISCONNECTED] = "disconnected",
+    [BLKIF_INTERFACE_STATUS_CONNECTED]    = "connected",
+    [BLKIF_INTERFACE_STATUS_CHANGED]      = "changed",
+};
+static unsigned int blkif_pt_state = BLKIF_STATE_CLOSED;
+static unsigned blkif_ptbe_irq;
+unsigned int blkif_ptbe_evtchn;
+
+/*-----[ Control Messages to/from Frontend VMs ]--------------------------*/
+
+
+/*
+ * Handle CMSG_BLKIF_BE_CREATE.
+ *
+ * Resets the single frontend-facing interface, ptfe_blkif, and records the
+ * domain id and handle carried in the message.  The interface starts out
+ * DISCONNECTED with a zero reference count.  create->status is always set
+ * to BLKIF_BE_STATUS_OKAY.
+ */
+void blkif_ptfe_create(blkif_be_create_t *create)
+{
+    /* Only one interface exists; kept as a pointer in case a hash returns. */
+    blkif_t *tap_if = &ptfe_blkif;
+
+    /* May want to store info on the connecting domain here. */
+
+    DPRINTK("PT got BE_CREATE\n");
+
+    /* blkif struct init code from blkback.c */
+    memset(tap_if, 0, sizeof(*tap_if));
+    tap_if->domid  = create->domid;
+    tap_if->handle = create->blkif_handle;
+    tap_if->status = DISCONNECTED;
+    spin_lock_init(&tap_if->blk_ring_lock);
+    atomic_set(&tap_if->refcnt, 0);
+
+    create->status = BLKIF_BE_STATUS_OKAY;
+}
+
+
+/*
+ * Handle CMSG_BLKIF_BE_DESTROY.  Nothing is torn down at present; the
+ * request is simply acknowledged with BLKIF_BE_STATUS_OKAY.
+ */
+void blkif_ptfe_destroy(blkif_be_destroy_t *destroy)
+{
+    /* Clear anything that we initialized above. */
+
+    DPRINTK("PT got BE_DESTROY\n");
+    destroy->status = BLKIF_BE_STATUS_OKAY;
+}
+
+/*
+ * Handle CMSG_BLKIF_BE_CONNECT.
+ *
+ * Maps the shared ring page named in the message into a freshly reserved
+ * kernel virtual area, binds the supplied event channel to an IRQ and
+ * installs blkif_ptfe_int() as its handler.
+ *
+ * The outcome is reported through connect->status:
+ *   BLKIF_BE_STATUS_OKAY                 - interface is now CONNECTED
+ *   BLKIF_BE_STATUS_OUT_OF_MEMORY        - no virtual area, or remap -ENOMEM
+ *   BLKIF_BE_STATUS_MAPPING_ERROR        - remap returned -EFAULT
+ *   BLKIF_BE_STATUS_INTERFACE_CONNECTED  - interface was not DISCONNECTED
+ *   BLKIF_BE_STATUS_ERROR                - any other remap or IRQ failure
+ * On every failure path the virtual area is released again.
+ */
+void blkif_ptfe_connect(blkif_be_connect_t *connect)
+{
+    domid_t       domid  = connect->domid;
+    unsigned int  evtchn = connect->evtchn;
+    unsigned long shmem_frame = connect->shmem_frame;
+    struct vm_struct *vma;
+    pgprot_t      prot;
+    int           error;
+    blkif_t      *blkif;
+
+    DPRINTK("PT got BE_CONNECT\n");
+
+    blkif = &ptfe_blkif; /* for convenience if the hash is readded later. */
+
+    if ( (vma = get_vm_area(PAGE_SIZE, VM_IOREMAP)) == NULL )
+    {
+        connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+        return;
+    }
+
+    prot = __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED);
+    error = direct_remap_area_pages(&init_mm, VMALLOC_VMADDR(vma->addr),
+                                    shmem_frame<<PAGE_SHIFT, PAGE_SIZE,
+                                    prot, domid);
+    if ( error != 0 )
+    {
+        WPRINTK("BE_CONNECT: error! (%d)\n", error);
+        if ( error == -ENOMEM ) 
+            connect->status = BLKIF_BE_STATUS_OUT_OF_MEMORY;
+        else if ( error == -EFAULT ) {
+            connect->status = BLKIF_BE_STATUS_MAPPING_ERROR;
+            WPRINTK("BE_CONNECT: MAPPING error!\n");
+        }
+        else
+            connect->status = BLKIF_BE_STATUS_ERROR;
+        vfree(vma->addr);
+        return;
+    }
+
+    if ( blkif->status != DISCONNECTED )
+    {
+        connect->status = BLKIF_BE_STATUS_INTERFACE_CONNECTED;
+        vfree(vma->addr);
+        return;
+    }
+
+    blkif->evtchn        = evtchn;
+    blkif->irq           = bind_evtchn_to_irq(evtchn);
+    blkif->shmem_frame   = shmem_frame;
+    blkif->blk_ring_base = (blkif_ring_t *)vma->addr;
+    /* Mark the interface connected before the handler can first run. */
+    blkif->status        = CONNECTED;
+
+    error = request_irq(blkif->irq, blkif_ptfe_int, 0,
+                        "blkif-pt-backend", blkif);
+    if ( error != 0 )
+    {
+        /*
+         * Without a handler the ring can never be serviced, so undo the
+         * binding and mapping instead of reporting success.
+         */
+        WPRINTK("BE_CONNECT: request_irq failed (%d)\n", error);
+        unbind_evtchn_from_irq(evtchn);
+        blkif->status        = DISCONNECTED;
+        blkif->blk_ring_base = NULL;
+        vfree(vma->addr);
+        connect->status = BLKIF_BE_STATUS_ERROR;
+        return;
+    }
+
+    connect->status = BLKIF_BE_STATUS_OKAY;
+}
+
+/*
+ * Handle CMSG_BLKIF_BE_DISCONNECT.
+ *
+ * The passthrough is deliberately not set to disconnected: the tap only
+ * acts as a pipe and leaves matters such as recovery to the real ends.
+ * The request is acknowledged with BLKIF_BE_STATUS_OKAY.
+ */
+void blkif_ptfe_disconnect(blkif_be_disconnect_t *disconnect)
+{
+    DPRINTK("PT got BE_DISCONNECT\n");
+
+    /* Acknowledge only; no interface state is touched. */
+    disconnect->status = BLKIF_BE_STATUS_OKAY;
+}
+
+/*-----[ Control Messages to/from Backend VM ]----------------------------*/
+
+/*
+ * Tell the controller to bring up the interface: sends a
+ * CMSG_BLKIF_FE_INTERFACE_CONNECT message for handle 0 that carries the
+ * machine frame number of blk_ptbe_ring.
+ */
+static void blkif_ptbe_send_interface_connect(void)
+{
+    ctrl_msg_t                    cmsg;
+    blkif_fe_interface_connect_t *conn;
+
+    /* Start from an all-zero message, then fill in the header. */
+    memset(&cmsg, 0, sizeof(cmsg));
+    cmsg.type    = CMSG_BLKIF_FE;
+    cmsg.subtype = CMSG_BLKIF_FE_INTERFACE_CONNECT;
+    cmsg.length  = sizeof(blkif_fe_interface_connect_t);
+
+    /* The payload is built in place inside the message buffer. */
+    conn = (void*)cmsg.msg;
+    conn->handle      = 0;
+    conn->shmem_frame = virt_to_machine(blk_ptbe_ring) >> PAGE_SHIFT;
+
+    ctrl_if_send_message_block(&cmsg, NULL, 0, TASK_UNINTERRUPTIBLE);
+}
+
+/*
+ * Called from blkif_ptbe_status() when the interface is reported CLOSED
+ * while the tap is DISCONNECTED or CONNECTED.  Currently a no-op.
+ */
+static void blkif_ptbe_close(void)
+{
+}
+
+/*
+ * Move from CLOSED to DISCONNECTED state.
+ *
+ * Allocates the page used as the ring towards the backend, resets the
+ * shared and private ring indices to zero, and asks the controller to
+ * connect the interface.  If no page can be allocated a warning is logged
+ * and the state is left unchanged.
+ */
+static void blkif_ptbe_disconnect(void)
+{
+    blkif_ring_t *ring = (blkif_ring_t *)__get_free_page(GFP_KERNEL);
+
+    /* __get_free_page() returns 0 on failure; never dereference that. */
+    if ( ring == NULL )
+    {
+        WPRINTK("Could not allocate the ring page for the backend.\n");
+        return;
+    }
+
+    ring->req_prod  = 0;
+    ring->resp_prod = 0;
+    ptbe_resp_cons  = 0;
+    ptbe_req_prod   = 0;
+    blk_ptbe_ring   = ring;
+
+    blkif_pt_state  = BLKIF_STATE_DISCONNECTED;
+    DPRINTK("Blkif-Passthrough-BE is now DISCONNECTED.\n");
+    blkif_ptbe_send_interface_connect();
+}
+
+/*
+ * Complete the connection to the backend.
+ *
+ * Binds the event channel named in the status message to an IRQ and
+ * installs blkif_ptbe_int() as its handler.  On success the tap moves to
+ * BLKIF_STATE_CONNECTED.  If the handler cannot be installed the binding
+ * is dropped again and the state is left unchanged.
+ */
+static void blkif_ptbe_connect(blkif_fe_interface_status_t *status)
+{
+    int err = 0;
+    
+    blkif_ptbe_evtchn = status->evtchn;
+    blkif_ptbe_irq    = bind_evtchn_to_irq(blkif_ptbe_evtchn);
+
+    err = request_irq(blkif_ptbe_irq, blkif_ptbe_int, 
+                      SA_SAMPLE_RANDOM, "blkif", NULL);
+    if ( err ) {
+        WPRINTK("request_irq for the backend channel failed (%d)\n", err);
+        /* Do not leave the event channel bound with no handler behind it. */
+        unbind_evtchn_from_irq(blkif_ptbe_evtchn);
+        return;
+    }
+
+    /*
+     * Transition to connected in case we need to do a partition probe on
+     * a whole disk.
+     */
+    blkif_pt_state = BLKIF_STATE_CONNECTED;
+}
+
+/*
+ * Log a warning about an interface status that is not expected in the
+ * current tap state.
+ *
+ * Both values are used as indices into the name tables.  status->status
+ * arrives in a control message, so each index is range-checked and
+ * "unknown" is printed for anything outside its table or without a name.
+ */
+static void unexpected(blkif_fe_interface_status_t *status)
+{
+    const char *status_name = "unknown";
+    const char *state_name  = "unknown";
+
+    if ( (status->status <
+          (sizeof(blkif_status_name) / sizeof(blkif_status_name[0]))) &&
+         (blkif_status_name[status->status] != NULL) )
+        status_name = blkif_status_name[status->status];
+
+    if ( (blkif_pt_state <
+          (sizeof(blkif_state_name) / sizeof(blkif_state_name[0]))) &&
+         (blkif_state_name[blkif_pt_state] != NULL) )
+        state_name = blkif_state_name[blkif_pt_state];
+
+    WPRINTK(" TAP: Unexpected blkif status %s in state %s\n", 
+           status_name, state_name);
+}
+
+/*
+ * Handle an interface status message for the connection to the backend.
+ *
+ * Only handle 0 is supported; anything else is ignored.  The action taken
+ * depends on the reported status together with the tap's current state
+ * (blkif_pt_state).  Combinations that should not occur are logged through
+ * unexpected(); some of them are still acted upon.
+ */
+static void blkif_ptbe_status(
+    blkif_fe_interface_status_t *status)
+{
+    if ( status->handle != 0 )
+    {
+        DPRINTK("Status change on unsupported blkif %d\n",
+               status->handle);
+        return;
+    }
+
+    DPRINTK("ptbe_status: got %s\n", blkif_status_name[status->status]);
+    
+    switch ( status->status )
+    {
+    case BLKIF_INTERFACE_STATUS_CLOSED:
+        switch ( blkif_pt_state )
+        {
+        case BLKIF_STATE_CLOSED:
+            unexpected(status);
+            break;
+        case BLKIF_STATE_DISCONNECTED:
+        case BLKIF_STATE_CONNECTED:
+            unexpected(status);
+            blkif_ptbe_close();
+            break;
+        }
+        break;
+        
+    case BLKIF_INTERFACE_STATUS_DISCONNECTED:
+        switch ( blkif_pt_state )
+        {
+        case BLKIF_STATE_CLOSED:
+            /* The expected first step: CLOSED -> DISCONNECTED. */
+            blkif_ptbe_disconnect();
+            break;
+        case BLKIF_STATE_DISCONNECTED:
+        case BLKIF_STATE_CONNECTED:
+            printk(KERN_ALERT "*** add recovery code to the tap driver. ***\n");
+            unexpected(status);
+            break;
+        }
+        break;
+        
+    case BLKIF_INTERFACE_STATUS_CONNECTED:
+        switch ( blkif_pt_state )
+        {
+        case BLKIF_STATE_CLOSED:
+            /* Skipped a step: run the DISCONNECTED transition first. */
+            unexpected(status);
+            blkif_ptbe_disconnect();
+            blkif_ptbe_connect(status);
+            break;
+        case BLKIF_STATE_DISCONNECTED:
+            /* The expected second step: DISCONNECTED -> CONNECTED. */
+            blkif_ptbe_connect(status);
+            break;
+        case BLKIF_STATE_CONNECTED:
+            unexpected(status);
+            blkif_ptbe_connect(status);
+            break;
+        }
+        break;
+
+   case BLKIF_INTERFACE_STATUS_CHANGED:
+        switch ( blkif_pt_state )
+        {
+        case BLKIF_STATE_CLOSED:
+        case BLKIF_STATE_DISCONNECTED:
+            unexpected(status);
+            break;
+        case BLKIF_STATE_CONNECTED:
+            /* vbd_update(); */
+            /* tap doesn't really get state changes... */
+            unexpected(status);
+            break;
+        }
+       break;
+       
+    default:
+        DPRINTK("Status change to unknown value %d\n", status->status);
+        break;
+    }
+}
+
+/*-----[ All control messages enter here: ]-------------------------------*/
+
+/*
+ * Receiver for all block-interface control messages; registered for both
+ * CMSG_BLKIF_FE and CMSG_BLKIF_BE.
+ *
+ * The message is dispatched on its type and subtype to the matching
+ * handler after its length has been checked against the expected payload
+ * struct.  The message is then sent back with ctrl_if_send_response().
+ * An unknown subtype or a length mismatch is answered with length set to 0.
+ *
+ * msg: the control message; handlers update its payload in place.
+ * id:  unused.
+ */
+void blkif_ctrlif_rx(ctrl_msg_t *msg, unsigned long id)
+{
+    switch ( msg->type )
+    {
+    case CMSG_BLKIF_FE:
+
+        switch ( msg->subtype )
+        {
+        case CMSG_BLKIF_FE_INTERFACE_STATUS:
+            if ( msg->length != sizeof(blkif_fe_interface_status_t) )
+                goto parse_error;
+            blkif_ptbe_status((blkif_fe_interface_status_t *) &msg->msg[0]);
+            break;        
+
+        default:
+            goto parse_error;
+        }
+        /*
+         * Do not fall through: a frontend message must not also be
+         * interpreted by the backend subtype switch below.
+         */
+        break;
+
+    case CMSG_BLKIF_BE:
+        
+        switch ( msg->subtype )
+        {
+        case CMSG_BLKIF_BE_CREATE:
+            if ( msg->length != sizeof(blkif_be_create_t) )
+                goto parse_error;
+            blkif_ptfe_create((blkif_be_create_t *)&msg->msg[0]);
+            break; 
+        case CMSG_BLKIF_BE_DESTROY:
+            if ( msg->length != sizeof(blkif_be_destroy_t) )
+                goto parse_error;
+            blkif_ptfe_destroy((blkif_be_destroy_t *)&msg->msg[0]);
+            break;        
+        case CMSG_BLKIF_BE_CONNECT:
+            if ( msg->length != sizeof(blkif_be_connect_t) )
+                goto parse_error;
+            blkif_ptfe_connect((blkif_be_connect_t *)&msg->msg[0]);
+            break;        
+        case CMSG_BLKIF_BE_DISCONNECT:
+            if ( msg->length != sizeof(blkif_be_disconnect_t) )
+                goto parse_error;
+            blkif_ptfe_disconnect((blkif_be_disconnect_t *)&msg->msg[0]);
+            break;        
+
+        /* We just ignore anything to do with vbds for now. */
+        
+        case CMSG_BLKIF_BE_VBD_CREATE:
+            DPRINTK("PT got VBD_CREATE\n");
+            ((blkif_be_vbd_create_t *)&msg->msg[0])->status 
+                = BLKIF_BE_STATUS_OKAY;
+            break;
+        case CMSG_BLKIF_BE_VBD_DESTROY:
+            DPRINTK("PT got VBD_DESTROY\n");
+            ((blkif_be_vbd_destroy_t *)&msg->msg[0])->status
+                = BLKIF_BE_STATUS_OKAY;
+            break;
+        case CMSG_BLKIF_BE_VBD_GROW:
+            DPRINTK("PT got VBD_GROW\n");
+            ((blkif_be_vbd_grow_t *)&msg->msg[0])->status
+                = BLKIF_BE_STATUS_OKAY;
+            break;
+        case CMSG_BLKIF_BE_VBD_SHRINK:
+            DPRINTK("PT got VBD_SHRINK\n");
+            ((blkif_be_vbd_shrink_t *)&msg->msg[0])->status
+                = BLKIF_BE_STATUS_OKAY;
+            break;
+        default:
+            goto parse_error;
+        }
+        break;
+
+    default:
+        /* Any other type is answered unchanged, as before. */
+        break;
+    }
+
+    ctrl_if_send_response(msg);
+    return;
+
+ parse_error:
+    msg->length = 0;
+    ctrl_if_send_response(msg);
+}
diff --git a/linux-2.6.9-xen-sparse/drivers/xen/blktap/blktap_datapath.c b/linux-2.6.9-xen-sparse/drivers/xen/blktap/blktap_datapath.c
new file mode 100644 (file)
index 0000000..c8733dc
--- /dev/null
@@ -0,0 +1,517 @@
+/******************************************************************************
+ * blktap_datapath.c
+ * 
+ * XenLinux virtual block-device tap.
+ * Block request routing data path.
+ * 
+ * Copyright (c) 2004, Andrew Warfield
+ *
+ */
+#include "blktap.h"
+
+/*-----[ The data paths ]-------------------------------------------------*/
+/*
+ * Connection to the frontend domain.  A single blkif_t is kept here; the
+ * handlers in this file use its shared ring (blk_ring_base), its private
+ * ring indices (blk_req_cons, blk_resp_prod), its domid and its evtchn.
+ */
+blkif_t   ptfe_blkif; 
+/* Connection to a single backend domain. */
+blkif_ring_t *blk_ptbe_ring;   /* Ring from the PT to the BE dom    */ 
+BLKIF_RING_IDX ptbe_resp_cons; /* Response consumer for comms ring. */
+BLKIF_RING_IDX ptbe_req_prod;  /* Private request producer.         */
+
+/*
+ * Rings up to user space.
+ * fe_ring: requests are produced into it by blktap_write_fe_ring() and
+ *          responses are consumed from it by blktap_read_fe_ring().
+ * be_ring: responses are produced into it by blktap_write_be_ring() and
+ *          requests are consumed from it by blktap_read_be_ring().
+ */ 
+blkif_req_ring_t fe_ring;// = BLKIF_REQ_RING_INIT;
+blkif_rsp_ring_t be_ring;// = BLKIF_RSP_RING_INIT;
+
+/*-----[ Ring helpers ]---------------------------------------------------*/
+
+/*
+ * BLKTAP_RING_FULL: report whether a user-space ring has room left.
+ *
+ * Returns non-zero when 'ring' is a request ring whose private producer is
+ * a full BLKIF_RING_SIZE ahead of its response consumer, and 0 otherwise.
+ */
+inline int BLKTAP_RING_FULL(blkif_generic_ring_t *ring)
+{
+    blkif_req_ring_t *req_ring;
+
+    /* for now assume that there is always room in the response path. */
+    if (ring->type != BLKIF_REQ_RING_TYPE)
+        return 0;
+
+    req_ring = (blkif_req_ring_t *)ring;
+
+    return ( ( req_ring->req_prod - req_ring->rsp_cons ) == BLKIF_RING_SIZE );
+}
+
+/*-----[ Tracking active requests ]---------------------------------------*/
+
+/* this must be the same as MAX_PENDING_REQS in blkback.c */
+#define MAX_ACTIVE_REQS 64
+
+/* Pool of request-tracking records. */
+active_req_t  active_reqs[MAX_ACTIVE_REQS];
+/* Ring of indices into active_reqs[]; free_active_req() pushes freed
+ * record indices here and active_reqs_init() seeds it with every index. */
+unsigned char active_req_ring[MAX_ACTIVE_REQS];
+/* Held while free_active_req() updates the ring and active_prod. */
+spinlock_t    active_req_lock = SPIN_LOCK_UNLOCKED;
+typedef unsigned int ACTIVE_RING_IDX;
+/* Free-running producer/consumer counters; wrap them with MASK_ACTIVE_IDX. */
+ACTIVE_RING_IDX active_prod, active_cons;
+/* Wrap a free-running counter into the ring.  The mask form is only
+ * correct while MAX_ACTIVE_REQS is a power of two. */
+#define MASK_ACTIVE_IDX(_i) ((_i)&(MAX_ACTIVE_REQS-1))
+/* Index of record _ar within active_reqs[].  The argument is parenthesized
+ * so that an expression argument keeps its intended grouping. */
+#define ACTIVE_IDX(_ar) ((_ar) - active_reqs)
+
+/*
+ * get_active_req: take a free tracking record out of the pool.
+ *
+ * Pops the next free record index off active_req_ring[] and returns the
+ * corresponding entry of active_reqs[].  The index must be read from the
+ * ring rather than derived from active_cons directly: records are returned
+ * by free_active_req() in completion order, so the slot that active_cons
+ * points at may name any record, and the record whose number happens to
+ * equal the masked counter may still be in flight.
+ *
+ * The caller must ensure a record is available (active_cons != active_prod).
+ * active_req_lock is taken so that the ring entry is read consistently with
+ * the producer side in free_active_req().
+ */
+inline active_req_t *get_active_req(void) 
+{
+    ACTIVE_RING_IDX slot;
+    unsigned long flags;
+
+    ASSERT(active_cons != active_prod);
+
+    spin_lock_irqsave(&active_req_lock, flags);
+    slot = active_req_ring[MASK_ACTIVE_IDX(active_cons++)];
+    spin_unlock_irqrestore(&active_req_lock, flags);
+
+    return &active_reqs[slot];
+}
+
+/*
+ * free_active_req: hand a tracking record back to the pool.
+ *
+ * Stores the record's index at the producer end of active_req_ring[] and
+ * advances active_prod, all under active_req_lock with interrupts disabled.
+ */
+inline void free_active_req(active_req_t *ar) 
+{
+    unsigned long irq_state;
+    unsigned char slot = ACTIVE_IDX(ar);
+
+    spin_lock_irqsave(&active_req_lock, irq_state);
+    active_req_ring[MASK_ACTIVE_IDX(active_prod)] = slot;
+    active_prod++;
+    spin_unlock_irqrestore(&active_req_lock, irq_state);
+}
+
+/*
+ * active_reqs_init: reset the tracking pool to "everything free".
+ *
+ * Zeroes every record, fills the free ring with the indices
+ * 0..MAX_ACTIVE_REQS-1 and sets the counters so that the whole ring is
+ * available to the consumer.
+ */
+inline void active_reqs_init(void)
+{
+    ACTIVE_RING_IDX slot = 0;
+
+    memset(active_reqs, 0, sizeof(active_reqs));
+
+    while ( slot < MAX_ACTIVE_REQS ) {
+        active_req_ring[slot] = slot;
+        slot++;
+    }
+
+    active_prod = MAX_ACTIVE_REQS;
+    active_cons = 0;
+}
+
+/*-----[ Data to/from Frontend (client) VMs ]-----------------------------*/
+
+/*
+ * blkif_ptfe_int: interrupt handler for the event channel from the frontend.
+ *
+ * Walks every request the frontend has produced since blk_req_cons.  Each
+ * request gets an active_req record and has its id replaced by that
+ * record's index (the original id is kept in the record).  Depending on
+ * blktap_mode the request is then copied to the user-space fe_ring and/or
+ * to the backend ring, and the corresponding consumer is notified.
+ *
+ * Runs with blkif_io_lock held and interrupts disabled.
+ * Always returns IRQ_HANDLED.
+ */
+irqreturn_t blkif_ptfe_int(int irq, void *dev_id, struct pt_regs *regs)
+{
+    /* we have pending messages from the real frontend. */
+
+    blkif_request_t *req_s, *req_d;
+    BLKIF_RING_IDX fe_rp;
+    unsigned long flags;
+    int notify;
+    unsigned long i;
+    active_req_t *ar;
+    
+    DPRINTK("PT got FE interrupt.\n");
+    
+    /* lock both rings */
+    spin_lock_irqsave(&blkif_io_lock, flags);
+
+    /* While there are REQUESTS on FERing: */
+    fe_rp = ptfe_blkif.blk_ring_base->req_prod;
+    rmb();
+    notify = (ptfe_blkif.blk_req_cons != fe_rp);
+
+    for (i = ptfe_blkif.blk_req_cons; i != fe_rp; i++) {
+
+        /* Get the next request */
+        req_s = &ptfe_blkif.blk_ring_base->ring[MASK_BLKIF_IDX(i)].req;
+        
+        /* This is a new request:  
+         * Assign an active request record, and remap the id. 
+         * Note that the id is rewritten in place in the frontend's ring.
+         */
+        ar = get_active_req();
+        ar->id = req_s->id;
+        req_s->id = ACTIVE_IDX(ar);
+        DPRINTK("%3lu < %3lu\n", req_s->id, ar->id);
+
+        /* FE -> BE interposition point is here. */
+        
+        /* ------------------------------------------------------------- */
+        /* BLKIF_OP_PROBE_HACK:                                          */
+        /* Until we have grant tables, we need to allow the backend to   */
+        /* map pages that are either from this domain, or more commonly  */
+        /* from the real front end.  We achieve this in a terrible way,  */
+        /* by passing the front end's domid along with PROBE messages.   */
+        /* Once grant tables appear, this should all go away.            */
+
+        if (req_s->operation == BLKIF_OP_PROBE) {
+            DPRINTK("Adding FE domid to PROBE request.\n");
+            /* Plain assignment with the cast on the value side: the   */
+            /* cast-as-lvalue form is a GCC extension that newer GCC   */
+            /* releases reject, and it stores the same value.          */
+            req_s->frame_and_sects[1] = (domid_t)ptfe_blkif.domid;
+        }
+
+        /* ------------------------------------------------------------- */
+
+        /* If we are in MODE_INTERCEPT_FE or MODE_COPY_FE: */
+        if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) ||
+             (blktap_mode & BLKTAP_MODE_COPY_FE) ) {
+            
+            /* Copy the request message to UFERing */
+            /* In MODE_INTERCEPT_FE, map attached pages into the app vma */
+            /* In MODE_COPY_FE_PAGES, copy attached pages into the app vma */
+
+            /* XXX: mapping/copying of attached pages is still not done! */
+
+            DPRINTK("req->UFERing\n"); 
+            blktap_write_fe_ring(req_s);
+
+
+        }
+
+        /* If we are not in MODE_INTERCEPT_FE or MODE_INTERCEPT_BE: */
+        if ( !((blktap_mode & BLKTAP_MODE_INTERCEPT_FE) ||
+               (blktap_mode & BLKTAP_MODE_INTERCEPT_BE)) ) {
+            
+            /* (be included to prevent noise from the fe when it's off) */
+            /* copy the request message to the BERing */
+
+            DPRINTK("blktap: FERing[%u] -> BERing[%u]\n", 
+                    (unsigned)MASK_BLKIF_IDX(i), 
+                    (unsigned)MASK_BLKIF_IDX(ptbe_req_prod));
+
+            req_d = &blk_ptbe_ring->ring[MASK_BLKIF_IDX(ptbe_req_prod)].req;
+            
+            memcpy(req_d, req_s, sizeof(blkif_request_t));
+
+            ptbe_req_prod++;
+        }
+    }
+
+    ptfe_blkif.blk_req_cons = i;
+
+    /* If we have forwarded any requests, notify the appropriate ends. */
+    if (notify) {
+
+        /* we have sent stuff to the be, notify it. */
+        if ( !((blktap_mode & BLKTAP_MODE_INTERCEPT_FE) ||
+               (blktap_mode & BLKTAP_MODE_INTERCEPT_BE)) ) {
+            /* Make the copied requests visible before the new producer. */
+            wmb();
+            blk_ptbe_ring->req_prod = ptbe_req_prod;
+
+            notify_via_evtchn(blkif_ptbe_evtchn);
+            DPRINTK(" -- and notified.\n");
+        }
+
+        /* we sent stuff to the app, notify it. */
+        if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_FE) ||
+             (blktap_mode & BLKTAP_MODE_COPY_FE) ) {
+
+            blktap_kick_user();
+        }
+    }
+
+    /* unlock rings */
+    spin_unlock_irqrestore(&blkif_io_lock, flags);
+
+    return IRQ_HANDLED;
+}
+
+/*
+ * write_req_to_be_ring: queue one request on the backend ring.
+ *
+ * Copies 'req' into the slot named by the private producer ptbe_req_prod
+ * and advances that producer.  The shared producer index is not updated
+ * here; see kick_be_domain().  Always returns 0.
+ */
+inline int write_req_to_be_ring(blkif_request_t *req)
+{
+    blkif_request_t *slot =
+        &blk_ptbe_ring->ring[MASK_BLKIF_IDX(ptbe_req_prod)].req;
+
+    memcpy(slot, req, sizeof(blkif_request_t));
+    ++ptbe_req_prod;
+
+    return 0;
+}
+
+/*
+ * kick_be_domain: publish queued requests to the backend and notify it.
+ *
+ * Copies the private producer ptbe_req_prod into the shared ring and then
+ * signals the backend event channel.
+ */
+inline void kick_be_domain(void) {
+    /* Ring entries must be visible before the producer index moves. */
+    wmb();
+    blk_ptbe_ring->req_prod = ptbe_req_prod;
+    notify_via_evtchn(blkif_ptbe_evtchn);
+}
+
+/*-----[ Data to/from Backend (server) VM ]------------------------------*/
+
+
+/*
+ * blkif_ptbe_int: interrupt handler for the event channel from the backend.
+ *
+ * Walks every response the backend has produced since ptbe_resp_cons.
+ * Depending on blktap_mode each response is copied to the user-space
+ * be_ring and/or forwarded to the frontend ring; when forwarded, the id is
+ * mapped back to the frontend's original id and the active_req record is
+ * released.  The relevant consumers are notified afterwards.
+ *
+ * Runs with blkif_io_lock held and interrupts disabled.
+ * Always returns IRQ_HANDLED.
+ */
+irqreturn_t blkif_ptbe_int(int irq, void *dev_id, 
+                                  struct pt_regs *ptregs)
+{
+    blkif_response_t  *resp_s, *resp_d;
+    BLKIF_RING_IDX be_rp;
+    unsigned long flags;
+    int notify;
+    unsigned long i;
+    active_req_t *ar;
+
+    DPRINTK("PT got BE interrupt.\n");
+
+    /* lock both rings */
+    spin_lock_irqsave(&blkif_io_lock, flags);
+    
+    /* While there are RESPONSES on BERing: */
+    be_rp = blk_ptbe_ring->resp_prod;
+    rmb();
+    notify = (ptbe_resp_cons != be_rp);
+    
+    for ( i = ptbe_resp_cons; i != be_rp; i++ )
+    {
+        /* BE -> FE interposition point is here. */
+        
+        /* Get the next response */
+        resp_s = &blk_ptbe_ring->ring[MASK_BLKIF_IDX(i)].resp;
+    
+       
+        /* If we are in MODE_INTERCEPT_BE or MODE_COPY_BE: */
+        if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_BE) ||
+             (blktap_mode & BLKTAP_MODE_COPY_BE) ) {
+
+            /* Copy the response message to UBERing */
+            /* In MODE_INTERCEPT_BE, map attached pages into the app vma */
+            /* In MODE_COPY_BE_PAGES, copy attached pages into the app vma */
+
+            /* XXX: copy/map the attached page! */
+
+            DPRINTK("rsp->UBERing\n"); 
+            blktap_write_be_ring(resp_s);
+
+        }
+       
+        /* If we are NOT in MODE_INTERCEPT_BE or MODE_INTERCEPT_FE: */
+        if ( !((blktap_mode & BLKTAP_MODE_INTERCEPT_BE) ||
+               (blktap_mode & BLKTAP_MODE_INTERCEPT_FE)) ) {
+            
+            /* (fe included to prevent random interference from the BE) */
+            /* Copy the response message to FERing */
+         
+            DPRINTK("blktap: BERing[%u] -> FERing[%u]\n", 
+                    (unsigned) MASK_BLKIF_IDX(i), 
+                    (unsigned) MASK_BLKIF_IDX(ptfe_blkif.blk_resp_prod));
+
+            /* remap id, and free the active req. blkif lookup goes here too.*/
+            /* NOTE(review): resp_s->id is read from the ring shared with   */
+            /* the backend and indexes active_reqs[] without a range check. */
+            ar = &active_reqs[resp_s->id];
+            DPRINTK("%3lu > %3lu\n", resp_s->id, ar->id);
+            resp_s->id = ar->id;
+            free_active_req(ar);
+           
+            resp_d = &ptfe_blkif.blk_ring_base->ring[
+                MASK_BLKIF_IDX(ptfe_blkif.blk_resp_prod)].resp;
+
+            memcpy(resp_d, resp_s, sizeof(blkif_response_t));
+            
+            ptfe_blkif.blk_resp_prod++;
+
+        }
+    }
+
+    ptbe_resp_cons = i;
+    
+    /* If we have forwarded any responses, notify the appropriate domains. */
+    if (notify) {
+
+        /* we have sent stuff to the fe.  notify it. */
+        if ( !((blktap_mode & BLKTAP_MODE_INTERCEPT_BE) ||
+               (blktap_mode & BLKTAP_MODE_INTERCEPT_FE)) ) {
+            /* Make the copied responses visible before the new producer. */
+            wmb();
+            ptfe_blkif.blk_ring_base->resp_prod = ptfe_blkif.blk_resp_prod;
+        
+            notify_via_evtchn(ptfe_blkif.evtchn);
+            DPRINTK(" -- and notified.\n");
+        }
+
+        /* we sent stuff to the app, notify it. */
+        if ( (blktap_mode & BLKTAP_MODE_INTERCEPT_BE) ||
+             (blktap_mode & BLKTAP_MODE_COPY_BE) ) {
+
+            blktap_kick_user();
+        }
+    }
+
+    spin_unlock_irqrestore(&blkif_io_lock, flags);
+    return IRQ_HANDLED;
+}
+
+/*
+ * write_resp_to_fe_ring: queue one response on the frontend ring.
+ *
+ * 'rsp' carries the remapped id that was handed out when the request was
+ * accepted.  The id is translated back to the frontend's original id, the
+ * tracking record is released, and the response is copied into the slot
+ * named by the private producer blk_resp_prod, which is then advanced.
+ * The shared producer index is not updated here; see kick_fe_domain().
+ *
+ * blktap_read_fe_ring() passes responses that live in the ring page mapped
+ * into user space, so the id is untrusted: it is fetched exactly once and
+ * range-checked before it is used as an index into active_reqs[].
+ *
+ * Returns 0 on success, or -1 (response dropped, nothing queued) if the id
+ * does not name a tracking record.
+ *
+ * NOTE(review): a response naming a record that is not currently in flight
+ * is still accepted; detecting that needs per-record state not kept here.
+ */
+inline int write_resp_to_fe_ring(blkif_response_t *rsp)
+{
+    blkif_response_t *resp_d;
+    active_req_t *ar;
+    unsigned long id, fe_id;
+
+    /* Single fetch: the source may change underneath us. */
+    id = rsp->id;
+    if ( id >= MAX_ACTIVE_REQS ) {
+        DPRINTK("blktap: dropping response with bad id %lu\n", id);
+        return -1;
+    }
+
+    /* remap id, and free the active req. blkif lookup goes here too.*/
+    ar = &active_reqs[id];
+    fe_id = ar->id;
+    DPRINTK("%3lu > %3lu\n", id, fe_id);
+    rsp->id = fe_id;
+    free_active_req(ar);
+            
+    resp_d = &ptfe_blkif.blk_ring_base->ring[
+        MASK_BLKIF_IDX(ptfe_blkif.blk_resp_prod)].resp;
+
+    memcpy(resp_d, rsp, sizeof(blkif_response_t));
+    /* Re-assert the id in case the source changed during the copy. */
+    resp_d->id = fe_id;
+    ptfe_blkif.blk_resp_prod++;
+
+    return 0;
+}
+
+/*
+ * kick_fe_domain: publish queued responses to the frontend and notify it.
+ *
+ * Copies the private producer blk_resp_prod into the shared ring and then
+ * signals the frontend's event channel.
+ */
+inline void kick_fe_domain(void) {
+    /* Ring entries must be visible before the producer index moves. */
+    wmb();
+    ptfe_blkif.blk_ring_base->resp_prod = ptfe_blkif.blk_resp_prod;
+    notify_via_evtchn(ptfe_blkif.evtchn);
+    
+}
+
+/*
+ * flush_requests: publish the private request producer to the backend ring
+ * and signal the backend event channel.  Performs the same steps as
+ * kick_be_domain().
+ */
+static inline void flush_requests(void)
+{
+    wmb(); /* Ensure that the backend can see the requests. */
+    blk_ptbe_ring->req_prod = ptbe_req_prod;
+    notify_via_evtchn(blkif_ptbe_evtchn);
+}
+
+/*-----[ Data to/from user space ]----------------------------------------*/
+
+
+/*
+ * blktap_write_fe_ring: pass a frontend request up to the character device.
+ *
+ * Copies 'req' into the next free slot of the user-space fe_ring and maps
+ * each page attached to the request into the application's vma at
+ * MMAP_VADDR(req->id, segment).  The private producer fe_ring.req_prod is
+ * advanced only if every segment was mapped.
+ *
+ * The request is silently dropped if the ring has not been mmap()ed yet,
+ * if the ring is full, or if a page cannot be mapped.
+ *
+ * Always returns 0.
+ */
+int blktap_write_fe_ring(blkif_request_t *req)
+{
+    blkif_request_t *target;
+    int error, i;
+
+    /*
+     * This is called to pass a request from the real frontend domain's
+     * blkif ring to the character device.
+     */
+
+    if ( ! blktap_ring_ok ) {
+        DPRINTK("blktap: fe_ring not ready for a request!\n");
+        return 0;
+    }
+
+    if ( BLKTAP_RING_FULL(RING(&fe_ring)) ) {
+        DPRINTK("blktap: fe_ring is full, can't add.\n");
+        return 0;
+    }
+
+    target = &fe_ring.ring->ring[MASK_BLKIF_IDX(fe_ring.req_prod)].req;
+    memcpy(target, req, sizeof(*req));
+
+/* maybe move this stuff out into a separate func ------------------- */
+
+    /*
+     * For now, map attached page into a fixed position into the vma.
+     * XXX: make this map to a free page.
+     */
+
+    /* Attempt to map the foreign pages directly in to the application */
+    for (i=0; i<target->nr_segments; i++) {
+
+        /* get an unused virtual address from the char device */
+        /* store the old page address */
+        /* replace the address with the virtual address */
+
+        /* blktap_vma->vm_start+((2+i)*PAGE_SIZE) */
+
+        /* Map segment i's own frame: each segment has its own entry in
+         * frame_and_sects[], so indexing with 0 would map the first page
+         * at every segment's address. */
+        error = direct_remap_area_pages(blktap_vma->vm_mm, 
+                                        MMAP_VADDR(req->id, i), 
+                                        target->frame_and_sects[i] & PAGE_MASK,
+                                        PAGE_SIZE,
+                                        blktap_vma->vm_page_prot,
+                                        ptfe_blkif.domid);
+        if ( error != 0 ) {
+            printk(KERN_INFO "remapping attached page failed! (%d)\n", error);
+            return 0;
+        }
+    }
+    /* fix the address of the attached page in the message. */
+    /* TODO:      preserve the segment number stuff here... */
+    /* target->frame_and_sects[0] = blktap_vma->vm_start + PAGE_SIZE;*/
+/* ------------------------------------------------------------------ */
+
+    
+    fe_ring.req_prod++;
+
+    return 0;
+}
+
+/*
+ * blktap_write_be_ring: pass a backend response up to the character device.
+ *
+ * Copies 'rsp' into the next slot of the user-space be_ring and advances
+ * the private producer be_ring.rsp_prod.  The response is dropped when the
+ * ring has not been set up yet or is reported full.  Always returns 0.
+ */
+int blktap_write_be_ring(blkif_response_t *rsp)
+{
+    blkif_response_t *slot;
+
+    if ( ! blktap_ring_ok ) {
+        DPRINTK("blktap: be_ring not ready for a request!\n");
+        goto out;
+    }
+
+    if ( BLKTAP_RING_FULL(RING(&be_ring)) ) {
+        DPRINTK("blktap: be_ring is full, can't add.\n");
+        goto out;
+    }
+
+    slot = &be_ring.ring->ring[MASK_BLKIF_IDX(be_ring.rsp_prod)].resp;
+    memcpy(slot, rsp, sizeof(blkif_response_t));
+
+    /* XXX: map attached pages and fix-up addresses in the copied address. */
+
+    ++be_ring.rsp_prod;
+
+ out:
+    return 0;
+}
+
+/*
+ * blktap_read_fe_ring: consume responses user space queued on the UFE ring.
+ *
+ * In MODE_INTERCEPT_FE every outstanding response is forwarded to the
+ * frontend ring and, if there was at least one, the frontend is kicked.
+ * In any other mode nothing is consumed.  Always returns 0.
+ */
+int blktap_read_fe_ring(void)
+{
+    BLKIF_RING_IDX resp_end;
+    unsigned long idx;
+    int have_work;
+
+    DPRINTK("blktap_read_fe_ring()\n");
+
+    resp_end = fe_ring.ring->resp_prod;
+    rmb();
+    have_work = (resp_end != fe_ring.rsp_cons);
+
+    /* Only forward from the UFERing to the FERing when intercepting. */
+    if ( !(blktap_mode & BLKTAP_MODE_INTERCEPT_FE) )
+        return 0;
+
+    /* XXX: remap pages on each message as necessary */
+    idx = fe_ring.rsp_cons;
+    while ( idx != resp_end ) {
+        DPRINTK("resp->fe_ring\n");
+        write_resp_to_fe_ring(&fe_ring.ring->ring[MASK_BLKIF_IDX(idx)].resp);
+        idx++;
+    }
+
+    fe_ring.rsp_cons = resp_end;
+
+    /* notify the fe if anything was forwarded */
+    if ( have_work ) {
+        DPRINTK("kick_fe_domain()\n");
+        kick_fe_domain();
+    }
+
+    return 0;
+}
+
+/*
+ * blktap_read_be_ring: consume requests user space queued on the UBE ring.
+ *
+ * In MODE_INTERCEPT_BE every outstanding request is forwarded to the
+ * backend ring and, if there was at least one, the backend is kicked.
+ * In any other mode nothing is consumed.  Always returns 0.
+ */
+int blktap_read_be_ring(void)
+{
+    BLKIF_RING_IDX req_end;
+    unsigned long idx;
+    int have_work;
+
+    DPRINTK("blktap_read_be_ring()\n");
+
+    req_end = be_ring.ring->req_prod;
+    rmb();
+    have_work = (req_end != be_ring.req_cons);
+
+    /* Only forward from the UBERing to the BERing when intercepting. */
+    if ( !(blktap_mode & BLKTAP_MODE_INTERCEPT_BE) )
+        return 0;
+
+    /* XXX: remap pages on each message as necessary */
+    idx = be_ring.req_cons;
+    while ( idx != req_end ) {
+        DPRINTK("req->be_ring\n");
+        write_req_to_be_ring(&be_ring.ring->ring[MASK_BLKIF_IDX(idx)].req);
+        idx++;
+    }
+
+    be_ring.req_cons = req_end;
+
+    /* notify the be if anything was forwarded */
+    if ( have_work ) {
+        DPRINTK("kick_be_domain()\n");
+        kick_be_domain();
+    }
+
+    return 0;
+}
diff --git a/linux-2.6.9-xen-sparse/drivers/xen/blktap/blktap_userdev.c b/linux-2.6.9-xen-sparse/drivers/xen/blktap/blktap_userdev.c
new file mode 100644 (file)
index 0000000..c10e3f3
--- /dev/null
@@ -0,0 +1,243 @@
+/******************************************************************************
+ * blktap_userdev.c
+ * 
+ * XenLinux virtual block-device tap.
+ * Control interface between the driver and a character device.
+ * 
+ * Copyright (c) 2004, Andrew Warfield
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/miscdevice.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/gfp.h>
+#include <linux/poll.h>
+#include <asm/pgalloc.h>
+
+#include "blktap.h"
+
+
+/* Current routing mode (BLKTAP_MODE_* bits); changed via BLKTAP_IOCTL_SETMODE. */
+unsigned long blktap_mode = BLKTAP_MODE_PASSTHROUGH;
+
+/* Only one process may open /dev/xen/blktap at any time. */
+static unsigned long blktap_dev_inuse;
+/* Set once the rings have been mmap()ed; cleared again on release. */
+unsigned long blktap_ring_ok; /* make this ring->state */
+
+/* for poll: */
+static wait_queue_head_t blktap_wait;
+
+/* Where things are inside the device mapping. */
+struct vm_area_struct *blktap_vma; /* vma recorded by blktap_mmap()       */
+unsigned long mmap_vstart;         /* first address after the ring pages  */
+unsigned long rings_vstart;        /* start of the mapping (ring pages)   */
+
+/* -------[ blktap vm ops ]------------------------------------------- */
+
+/*
+ * blktap_nopage: fault handler for the blktap mapping.
+ *
+ * Pages in this vma are only ever installed explicitly by the driver, so a
+ * fault means the application touched an address that has nothing mapped.
+ * SIGBUS is sent to the faulting task and no page is supplied.
+ */
+static struct page *blktap_nopage(struct vm_area_struct *vma,
+                                  unsigned long address,
+                                  int *type)
+{
+    struct page *no_page = NULL;
+
+    force_sig(SIGBUS, current);
+
+    return no_page;
+}
+
+/*
+ * VM operations installed on the blktap mapping by blktap_mmap().
+ * Uses C99 designated initializers, matching blktap_miscdev, instead of
+ * the obsolete GNU "field:" form.
+ */
+struct vm_operations_struct blktap_vm_ops = {
+    .nopage = blktap_nopage,
+};
+
+/* -------[ blktap file ops ]----------------------------------------- */
+
+/*
+ * blktap_open: open handler for the blktap character device.
+ *
+ * Claims exclusive use of the device and allocates one zeroed, reserved
+ * page for each of the two user-space rings, resetting their indices.
+ *
+ * Returns 0 on success, -EBUSY if the device is already open, or -ENOMEM
+ * if a ring page cannot be allocated.  On -ENOMEM everything acquired so
+ * far is undone, including the exclusive-use claim, so that a later open
+ * can succeed.
+ */
+static int blktap_open(struct inode *inode, struct file *filp)
+{
+    if ( test_and_set_bit(0, &blktap_dev_inuse) )
+        return -EBUSY;
+
+    printk(KERN_ALERT "blktap open.\n");
+
+    /* Allocate the fe ring. */
+    fe_ring.ring = (blkif_ring_t *)get_zeroed_page(GFP_KERNEL);
+    if (fe_ring.ring == NULL)
+        goto fail_nomem;
+
+    SetPageReserved(virt_to_page(fe_ring.ring));
+    
+    fe_ring.ring->req_prod = fe_ring.ring->resp_prod
+                           = fe_ring.req_prod
+                           = fe_ring.rsp_cons
+                           = 0;
+
+    /* Allocate the be ring. */
+    be_ring.ring = (blkif_ring_t *)get_zeroed_page(GFP_KERNEL);
+    if (be_ring.ring == NULL)
+        goto fail_free_fe;
+
+    SetPageReserved(virt_to_page(be_ring.ring));
+    
+    be_ring.ring->req_prod = be_ring.ring->resp_prod
+                           = be_ring.rsp_prod
+                           = be_ring.req_cons
+                           = 0;
+
+    DPRINTK(KERN_ALERT "blktap open.\n");
+
+    return 0;
+
+ fail_free_fe:
+    /* Undo SetPageReserved before the page goes back to the allocator. */
+    ClearPageReserved(virt_to_page(fe_ring.ring));
+    free_page( (unsigned long) fe_ring.ring);
+
+ fail_nomem:
+    /* Drop the exclusive-use claim, otherwise every later open is -EBUSY. */
+    clear_bit(0, &blktap_dev_inuse);
+    return -ENOMEM;
+}
+
+/*
+ * blktap_release: release handler for the blktap character device.
+ *
+ * Marks the rings unusable, frees both ring pages and finally gives up the
+ * exclusive-use claim taken in blktap_open().  The claim is dropped last:
+ * if it were dropped first, another process could open the device and
+ * install fresh ring pages that this function would then free.
+ *
+ * Always returns 0.
+ */
+static int blktap_release(struct inode *inode, struct file *filp)
+{
+    /* Stop the data path from queueing onto the rings. */
+    blktap_ring_ok = 0;
+
+    printk(KERN_ALERT "blktap closed.\n");
+
+    /* Free the ring pages. */
+    ClearPageReserved(virt_to_page(fe_ring.ring));
+    free_page((unsigned long) fe_ring.ring);
+
+    ClearPageReserved(virt_to_page(be_ring.ring));
+    free_page((unsigned long) be_ring.ring);
+
+    /* Only now may the next opener in. */
+    smp_mb__before_clear_bit();
+    clear_bit(0, &blktap_dev_inuse);
+    
+    return 0;
+}
+
+/*
+ * blktap_mmap: mmap handler for the blktap character device.
+ *
+ * The mapping must be exactly MMAP_PAGES + RING_PAGES pages long.  The
+ * be_ring page is mapped at the start of the region and the fe_ring page
+ * directly after it; the remainder is left unpopulated for request pages.
+ * On success the vma is recorded and the rings are marked usable.
+ *
+ * Returns 0 on success, or -EAGAIN if the size is wrong or a ring page
+ * cannot be mapped.  On failure the rings are not marked usable.
+ */
+static int blktap_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+    int size;
+
+    printk(KERN_ALERT "blktap mmap (%lx, %lx)\n",
+           vma->vm_start, vma->vm_end);
+
+    vma->vm_ops = &blktap_vm_ops;
+
+    size = vma->vm_end - vma->vm_start;
+    if ( size != ( (MMAP_PAGES + RING_PAGES) << PAGE_SHIFT ) ) {
+        printk(KERN_INFO 
+               "blktap: you _must_ map exactly %d pages!\n",
+               MMAP_PAGES + RING_PAGES);
+        return -EAGAIN;
+    }
+
+    size >>= PAGE_SHIFT;
+    /* Report the pages left over once the ring pages are accounted for. */
+    printk(KERN_INFO "blktap: 2 rings + %d pages.\n", size - RING_PAGES);
+    
+    rings_vstart = vma->vm_start;
+    mmap_vstart  = rings_vstart + (RING_PAGES << PAGE_SHIFT);
+    
+    /* Map the ring pages to the start of the region and reserve it. */
+
+    /* not sure if I really need to do this... */
+    vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+    DPRINTK("Mapping be_ring page %lx.\n", __pa(be_ring.ring));
+    if (remap_page_range(vma, vma->vm_start, __pa(be_ring.ring), PAGE_SIZE, 
+                         vma->vm_page_prot)) {
+        printk(KERN_ERR "be_ring: remap_page_range failure!\n");
+        /* Do not claim success with a ring the application cannot see. */
+        return -EAGAIN;
+    }
+
+    DPRINTK("Mapping fe_ring page %lx.\n", __pa(fe_ring.ring));
+    if (remap_page_range(vma, vma->vm_start + PAGE_SIZE, __pa(fe_ring.ring), 
+                         PAGE_SIZE, vma->vm_page_prot)) {
+        printk(KERN_ERR "fe_ring: remap_page_range failure!\n");
+        return -EAGAIN;
+    }
+
+    blktap_vma = vma;
+    blktap_ring_ok = 1;
+
+    return 0;
+}
+
+/*
+ * blktap_ioctl: ioctl handler for the blktap character device.
+ *
+ *   BLKTAP_IOCTL_KICK_FE  - process messages on the fe ring;
+ *                           returns blktap_read_fe_ring()'s result.
+ *   BLKTAP_IOCTL_KICK_BE  - process messages on the be ring;
+ *                           returns blktap_read_be_ring()'s result.
+ *   BLKTAP_IOCTL_SETMODE  - set blktap_mode to 'arg' if it is a valid
+ *                           mode; returns 0.
+ *
+ * Unknown commands, and SETMODE with an invalid mode, return -ENOIOCTLCMD.
+ */
+static int blktap_ioctl(struct inode *inode, struct file *filp,
+                        unsigned int cmd, unsigned long arg)
+{
+    int rc = -ENOIOCTLCMD;
+
+    switch(cmd) {
+    case BLKTAP_IOCTL_KICK_FE: /* There are fe messages to process. */
+        rc = blktap_read_fe_ring();
+        break;
+
+    case BLKTAP_IOCTL_KICK_BE: /* There are be messages to process. */
+        rc = blktap_read_be_ring();
+        break;
+
+    case BLKTAP_IOCTL_SETMODE:
+        /* XXX: return a more meaningful error case here. */
+        if (!BLKTAP_MODE_VALID(arg))
+            break;
+
+        blktap_mode = arg;
+        /* XXX: may need to flush rings here. */
+        printk(KERN_INFO "blktap: set mode to %lx\n", arg);
+        rc = 0;
+        break;
+    }
+
+    return rc;
+}
+
+/*
+ * blktap_poll: poll handler for the blktap character device.
+ *
+ * Registers the caller on blktap_wait.  If either private producer is
+ * ahead of the index published in its shared ring page, both producers
+ * are published and the device is reported readable.
+ */
+static unsigned int blktap_poll(struct file *file, poll_table *wait)
+{
+        int fe_pending, be_pending;
+
+        poll_wait(file, &blktap_wait, wait);
+
+        fe_pending = (fe_ring.req_prod != fe_ring.ring->req_prod);
+        be_pending = (be_ring.rsp_prod != be_ring.ring->resp_prod);
+
+        if ( !fe_pending && !be_pending )
+            return 0;
+
+        /* Publish both rings' producers, whichever one had new entries. */
+        fe_ring.ring->req_prod = fe_ring.req_prod;
+        be_ring.ring->resp_prod = be_ring.rsp_prod;
+
+        return POLLIN | POLLRDNORM;
+}
+
+/*
+ * blktap_kick_user: wake any task sleeping on blktap_wait.
+ *
+ * The producer indices themselves are published in blktap_poll(), not here.
+ */
+void blktap_kick_user(void)
+{
+    /* blktap_ring->req_prod = blktap_req_prod; */
+    wake_up_interruptible(&blktap_wait);
+}
+
+/*
+ * File operations for the blktap character device.
+ * Uses C99 designated initializers, matching blktap_miscdev, instead of
+ * the obsolete GNU "field:" form.
+ */
+static struct file_operations blktap_fops = {
+    .owner    = THIS_MODULE,
+    .poll     = blktap_poll,
+    .ioctl    = blktap_ioctl,
+    .open     = blktap_open,
+    .release  = blktap_release,
+    .mmap     = blktap_mmap,
+};
+
+/* -------[ blktap module setup ]------------------------------------- */
+
+/* Misc-device descriptor registered by blktap_init(); ties the device
+ * name and minor number to blktap_fops. */
+static struct miscdevice blktap_miscdev = {
+    .minor        = BLKTAP_MINOR,
+    .name         = "blktap",
+    .fops         = &blktap_fops,
+    .devfs_name   = "misc/blktap",
+};
+
+/*
+ * blktap_init: set up the user-space side of the block tap.
+ *
+ * Initialises the poll wait queue and registers the misc device.  The wait
+ * queue is prepared first because the device can be opened and polled as
+ * soon as misc_register() succeeds.
+ *
+ * Returns 0 on success or the error code from misc_register().
+ */
+int blktap_init(void)
+{
+    int err;
+
+    init_waitqueue_head(&blktap_wait);
+
+    err = misc_register(&blktap_miscdev);
+    if ( err != 0 )
+    {
+        printk(KERN_ALERT "Couldn't register /dev/misc/blktap (%d)\n", err);
+        return err;
+    }
+
+    return 0;
+}